/* This dofile is to clean and match tvet admission data for grade 12 students with the current database */

// import dataset, clean and separate for grade 10 and grade 12
import excel $noncbt, clear firstrow 
drop M N O
ren nac_status_name app_status
tempfile file1
save `file1', replace 

import excel $cbt, clear firstrow 
g a_cbt = 1
lab var a_cbt "whether the data is from CBT-trainees" 
append using `file1'

ren app_name name
ren cid_no citizenid
ren dob dateofbirth

rename *, lower   

g i_driving_tvetadmin = 0
replace i_driving_tvetadmin = 1 if strpos(institute_name, "Driving")>0
lab var i_driving_tvetadmin "0/1 whether the institute is driving center in the tvet admin data"

g i_driving_private = 0
replace i_driving_private = 1 if i_driving_tvetadmin == 1 | a_cbt == 1
lab var i_driving_private "0/1 whether the institute is driving center or private institute in the tvet admin data"



replace citizenid = "" if length(citizenid)~=11
replace citizenid = "" if citizenid == "UNDER PROCE"
destring citizenid, replace

tostring citizenid, replace format("%20.0f")

replace institute_name = trim(itrim(lower(institute_name)))
tab institute_name

replace name = trim(itrim(lower(name)))
replace name = subinstr(name," ","",.)  
replace name = subinstr(name,",","",.)   //Removes comma (,)
replace name = subinstr(name,"'","",.)   //Removes apostrophe (')
replace name = subinstr(name,".","",.)   //Removes dot (.) 
replace name = subinstr(name,"/","",.)   //Removes slash (/)
replace name = subinstr(name,"-","",.)   //Removes dash (-)
replace name = subinstr(name,"=","",.)   //Removes dash (-)
replace name = subinstr(name,"(","",.)   //Removes opening parentheses
replace name = subinstr(name,")","",.)   //Removes closing parentheses

gen dateofbirth1 = date(dateofbirth, "MDY")
format dateofbirth1 %td
g b_month = month(dateofbirth1)
g b_day = day(dateofbirth1)
g b_year = year(dateofbirth1)

g sex = 1 if gender == "M"
replace sex = 2 if gender == "F"
replace sex = 3 if gender == "\N"

drop dateofbirth dateofbirth1

g double dateofbirth = b_year*10^4 + b_month*10^2 + b_day
format dateofbirth %20.0f 
tostring dateofbirth, replace 

format course_start_date %td
g app_year = year(course_start_date)
lab var app_year "year of application and start of course"

g a_admtvet = "Yes"
replace a_admtvet = "No" if app_status == "Not Selected"
replace a_admtvet = "N/A" if app_status == "\N"
lab var a_admtvet "got admission to TVET (administrative data)"

g a_adm_tvet = regexm(a_admtvet, "Yes")
lab var a_adm_tvet "admission to TVET (administrative data)"

g a_apptvet = "Yes"
lab var a_apptvet "application to TVET (administrative data)"

g a_app_tvet = regexm(a_apptvet, "Yes")
lab var a_app_tvet "application to TVET (administrative data)"

g a_adm_tvet_dr = a_adm_tvet
replace a_adm_tvet_dr = . if i_driving_tvetadmin == 1
lab var a_adm_tvet_dr "admission to TVET (administrative data w/o driving centers)"

g a_adm_tvet_drpr = a_adm_tvet
replace a_adm_tvet_drpr = . if i_driving_private == 1
lab var a_adm_tvet_drpr "admission to TVET (administrative data w/o driving centers and private institutes)"


g a_app_tvet_dr = a_app_tvet
replace a_app_tvet_dr = . if i_driving_tvetadmin == 1
lab var a_app_tvet_dr "application to TVET (administrative data w/o driving centers)"

g a_app_tvet_drpr = a_app_tvet
replace a_app_tvet_drpr = . if i_driving_private == 1
lab var a_app_tvet_drpr "application to TVET (administrative data w/o driving centers and private institutes)"

// keep data from 2022 
keep if app_year == 2022

// for students who applied several times, keep the outcomes of the latest application
sort institute_name name citizenid sex a_cbt course_start_date
duplicates tag institute_name name citizenid sex , g(temp)
*br institute_name name citizenid sex if temp~=0

g dup = 0
forval i=1/3 {
replace dup = 1 if institute_name == institute_name[_n+`i'] & name == name[_n+`i'] & sex == sex[_n+`i'] & citizenid==citizenid[_n+`i']
}

keep if dup == 0
drop temp

foreach variable in name citizenid dateofbirth sex b_month b_day b_year{
	rename `variable' `variable'_t 
}

keep if qualification_name == "Class XII"
g tvetindexnumber = _n
save "$temp/tvetadm_grade12.dta", replace 


********************************************************************************
*
*			MERGE WITH GRADE 12 SURVEY DATA 
*
********************************************************************************

// STEP 1: PERFECT MATCHING ON CITIZEN ID + DOUBLE CHECK NAME & BIRTHDAY
********************************************************************************
gl criteria citizenid  

* open the tvet choice data 
use "$temp/tvetadm_grade12.dta", clear 
rename citizenid_t citizenid 

duplicates tag $criteria, g(temp)
tab temp

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/tvet_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename citizenid citizenid_t
save $temp/tvet_step1d, replace


* open the survey data 
use "$temp/tvet_analysis.dta", clear
replace name = subinstr(name," ","",.)  
replace sex = 2 if sex == 0 
format citizenid %20s

generate str citizenid_st = citizenid
replace citizenid = ""
compress citizenid
replace citizenid = citizenid_st
drop citizenid_st
describe citizenid

save "$temp/data12.dta", replace 


use "$temp/data12.dta", clear

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
rename citizenid citizenid_t
save $temp/data12_step1d, replace

* merging two unique data together using fully matched citizenid ID & check fuzzy name 
use $temp/tvet_step1u, clear
merge 1:1 $criteria using $temp/data12_step1u
matchit name_t name, g(namescore)
matchit dateofbirth_t dateofbirth, g(bscore)
*br namescore bscore if _merge==3 
sum namescore if _merge==3
sum bscore if _merge==3

* correct if names are not matched, leave for the next round of matching
replace _merge = . if (namescore <= 0.5 & bscore < 0.8) & _merge == 3

* keep matched data 
preserve 
keep if _merge==3 
keep tvetindexnumber uniqueid 
unique tvetindexnumber
save $temp/tvet_step1m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep tvetindexnumber 
merge 1:1 tvetindexnumber using $temp/tvetadm_grade12.dta, keep(matched) nogen 
append using $temp/tvet_step1d
save $temp/tvet_step2b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step1d
save $temp/data12_step2b, replace 
restore 


// STEP 2: PERFECT MATCHING ON NAME SEX DOB  
********************************************************************************
gl criteria name sex dateofbirth

* open the tvet choice data 
use $temp/tvet_step2b.dta, clear 
ren (name_t sex_t dateofbirth_t) (name sex dateofbirth) 
duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/tvet_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename citizenid citizenid_t
save $temp/tvet_step2d, replace


* open the survey data 
use $temp/data12_step2b.dta, clear
rename phone phone_b

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data12_step2d, replace


* merging two unique data together using fully matched name sex dob & check fuzzy CID 
use $temp/tvet_step2u, clear
merge 1:1 $criteria using $temp/data12_step2u
matchit citizenid_t citizenid, g(cidscore)
*br cidscore if _merge==3
sum cidscore if _merge==3

replace _merge = . if (cidscore < 0.6 | missing(citizenid)) & _merge == 3 

* keep matched data 
preserve 
keep if _merge==3 
keep tvetindexnumber uniqueid 
unique tvetindexnumber 
save $temp/tvet_step2m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep tvetindexnumber 
merge 1:1 tvetindexnumber using $temp/tvetadm_grade12.dta, keep(matched) nogen 
append using $temp/tvet_step2d
save $temp/tvet_step3b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step2d
save $temp/data12_step3b, replace 
restore  


// combine data together 
********************************************************************************
clear 
append using $temp/tvet_step1m
g a_tvet_match_criteria = "perfect(CID) + fuzzy(name/dob)"

append using $temp/tvet_step2m
replace a_tvet_match_criteria = "perfect(name-sex-dob) + fuzzy(CID)" if missing(a_tvet_match_criteria)

count

tab a_tvet_match_criteria

merge 1:1 tvetindexnumber using "$temp/tvetadm_grade12.dta", keep(matched) keepusing(a_* inst*) nogen 
drop tvetindexnumber
g a_adm_data = 1 
lab var a_adm_data "1/0 tvet admission data available"

save "$temp/grade12_analysis_tvetadm.dta", replace 

// separate cbt and non-cbt
********************************************************************************
use "$temp/grade12_analysis_tvetadm.dta", clear

preserve
keep if a_cbt == 1
save "$temp/grade12_analysis_tvetadm_cbt.dta", replace 
restore

preserve 
drop if a_cbt == 1
save "$temp/grade12_analysis_tvetadm_noncbt.dta", replace 
restore
